More process building tools!

Use multiple feature extractors on the same data and concatenate their results.


In [ ]:
from sklearn.pipeline import make_union, make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
# sklearn.grid_search was removed in scikit-learn 0.20; prefer the
# model_selection module and fall back only on very old installations.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV
import numpy as np

In [ ]:
from sklearn.datasets import fetch_20newsgroups

In [ ]:
# Load the 20 newsgroups text corpus; "train" is the subset that
# fetch_20newsgroups returns when no subset is requested.
news = fetch_20newsgroups(subset="train")

In [ ]:
# Pull the documents and their target labels out of the dataset object.
data = news.data
y = news.target

In [ ]:
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# lives in sklearn.model_selection since 0.18.  Fall back to the old module
# only on installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out part of the documents for evaluation.  No split size is given,
# so train_test_split applies its default train/test proportions.
data_train, data_test, y_train, y_test = train_test_split(data, y)

In [ ]:
# Two bag-of-n-grams extractors run on the same raw text -- one tokenising
# by character, one by word -- and their outputs are joined into one union.
char_and_word = make_union(
    CountVectorizer(analyzer="char"),
    CountVectorizer(analyzer="word"),
)

# Feature union followed by a linear SVM (primal formulation, dual=False).
text_pipe = make_pipeline(char_and_word, LinearSVC(dual=False))

# Candidate values for the SVM's C parameter: 10**-3 through 10**2.
param_grid = {'linearsvc__C': np.power(10., np.arange(-3, 3))}

# 5-fold cross-validated search over the grid, with detailed progress output.
grid = GridSearchCV(
    estimator=text_pipe,
    param_grid=param_grid,
    cv=5,
    verbose=10,
)

In [ ]:
# Run the cross-validated parameter search on the training split only.
grid.fit(X=data_train, y=y_train)

In [ ]:
# Larger search space: besides the SVM's C, also vary the n-gram range of
# each vectorizer inside the feature union.  make_union names the two
# CountVectorizer steps "countvectorizer-1" and "countvectorizer-2".
param_grid = {
    # n-gram ranges for the first CountVectorizer in the union
    'featureunion__countvectorizer-1__ngram_range': [
        (1, 3),
        (1, 5),
        (2, 5),
    ],
    # n-gram ranges for the second CountVectorizer in the union
    'featureunion__countvectorizer-2__ngram_range': [
        (1, 1),
        (1, 2),
        (2, 2),
    ],
    # C values 10**-3 through 10**2
    'linearsvc__C': np.power(10., np.arange(-3, 3)),
}